Mulivariate Prophet Model

Predicting COVID-19 Deaths (STAT 390)

Author

Ryan Nguyen

1. Import Libraries & Set Seed

Code
library(forecast)
library(tseries)
library(tidyverse)
library(tidymodels)
library(kableExtra)
library(modeltime)
library(prophet)
library(timetk)
library(Metrics)
library(zoo)
library(MLmetrics)

set.seed(123)

2. Pull Data

Code
selected_deaths <- read_csv("data/finaldata_selectedfeatures.csv")

3. Data Processing

Code
selected_deaths <- selected_deaths  %>% 
  rename(ds = date,
         y = covid_19_deaths)

# LOCF 
selected_deaths <- fill(selected_deaths, everything())

selected_deaths <- replace(selected_deaths, is.na(selected_deaths), 0)

naniar::miss_var_summary(selected_deaths)
# A tibble: 93 × 3
   variable                  n_miss pct_miss
   <chr>                      <int>    <dbl>
 1 ds                             0        0
 2 dayofweek                      0        0
 3 quarter                        0        0
 4 month                          0        0
 5 dayofyear                      0        0
 6 dayofmonth                     0        0
 7 weekofyear                     0        0
 8 deaths_half_year_lag           0        0
 9 deaths_1_year_lag              0        0
10 deaths_1_andhalf_year_lag      0        0
# ℹ 83 more rows
Code
selected_deaths <- selected_deaths %>% 
  mutate(y = case_when((y == 0) ~ 1,
                       .default = y))

selected_deaths <- selected_deaths %>% 
  mutate(y = log(y))

4. Train Test Split

Code
#splits <- time_series_split(selected_deaths, initial = "148 weeks", assess = "17 weeks")
#train_data <- training(splits)
#test_data <- testing(splits)

train_data <- selected_deaths %>% 
  filter(year != 2023)

test_data <- selected_deaths %>% 
  filter(year == 2023)

5. Baseline Model

5.1 Build Baseline Model

Code
# Use default parameters to initiate Prophet Model
# Fit model on training set
baseline_model <- prophet(train_data)

5.2 Baseline Model Forecast

Code
# Create time range for forecast
future_baseline <- make_future_dataframe(baseline_model, freq = "week", periods = 8)

# Make prediction
forecast_baseline <- predict(baseline_model, future_baseline)

# Visualize the forecast
plot(baseline_model, forecast_baseline) +
  labs(title = "Baseline Prophet Model")

Code
prophet_plot_components(baseline_model, forecast_baseline)

5.3 Baseline Model Performance

Code
# Merge actual and predicted values
forecast_baseline2 <- forecast_baseline %>%
  filter(ds > as.POSIXct("2023-01-01"))

performance_baseline <- full_join(test_data, forecast_baseline2, by = join_by(ds == ds)) %>% 
  select(ds, y, yhat)

performance_baseline <- performance_baseline %>% 
  mutate(y = exp(y),
         yhat = exp(yhat))

ggplot(performance_baseline) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Baseline Prophet Test Set", 
       y = "Deaths",
       x = "Dates")

Code
# Check MAE value
baseline_mae <- mae(performance_baseline$y, performance_baseline$yhat)
print(paste("The MAE for the baseline model is", baseline_mae))
[1] "The MAE for the baseline model is 43.6011289138982"
Code
# Check MASE value
baseline_mase <- mase(performance_baseline$y, performance_baseline$yhat)
print(paste("The MASE for the baseline model is", baseline_mase))
[1] "The MASE for the baseline model is 0.713818066599086"

6. Tuning Changepoints

6.1 Plot Default Changepoints

Code
plot(baseline_model, forecast_baseline) + add_changepoints_to_plot(baseline_model)

Code
# Create prophet model with CI of 95%
model_changepoint <- prophet(train_data, interval.width = 0.95, n.changepoints = 6)

6.2 New Changepoints Model Forecast

Code
# Create time range for forecast
future_changepoint <- make_future_dataframe(model_changepoint, freq = "week", periods = 8)

# Make prediction
forecast_changepoint <- predict(model_changepoint, future_changepoint)

# Visualize the forecast
plot(model_changepoint, forecast_changepoint) +
  labs(title = "Changepoint Prophet Model")

Code
# Visualize forecast components
prophet_plot_components(model_changepoint, forecast_changepoint)

Code
# Change points to plot
plot(model_changepoint, forecast_changepoint) + add_changepoints_to_plot(model_changepoint)

Code
# Merge actual and predicted values
forecast_changepoint2 <- forecast_changepoint %>%
  filter(ds > as.POSIXct("2023-01-01"))

performance_changepoint <- full_join(test_data, forecast_changepoint2, by = join_by(ds == ds)) %>% 
  select(ds, y, yhat)

performance_changepoint <- performance_changepoint %>% 
  mutate(y = exp(y),
         yhat = exp(yhat))

ggplot(performance_changepoint) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Changepoint Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 

Code
# Check MAE value
changepoint_mae <- mae(performance_changepoint$y, performance_changepoint$yhat)
print(paste("The MAE for the changepoint model is", changepoint_mae))
[1] "The MAE for the changepoint model is 43.3383765752917"
Code
# Check MASE value
changepoint_mase <- mase(performance_changepoint$y, performance_changepoint$yhat)
print(paste("The MASE for the changepoint model is", changepoint_mase))
[1] "The MASE for the changepoint model is 0.70951640352268"

7. Add Seasonality to Baseline Model

7.1 Build Model with Seasonality

Code
# Add seasonality and fit to training set
model_season <- prophet(train_data,
                        yearly.seasonality = TRUE,
                        weekly.seasonality = TRUE)

7.2 Seasonality Model Forecast

Code
# Create time range for forecast
future_season <- make_future_dataframe(model_season, freq = "week", periods = 8)

# Make prediction
forecast_season <- predict(model_season, future_season)

# Visualize the forecast
plot(model_season, forecast_season) +
  labs(title = "Seasonal Prophet Model")

Code
prophet_plot_components(model_season, forecast_season)

7.3 Seasonality Model Performance

Code
# Merge actual and predicted values
forecast_season2 <- forecast_season %>%
  filter(ds > as.POSIXct("2023-01-01"))

performance_season <- full_join(test_data, forecast_season2, by = join_by(ds == ds)) %>% 
  select(ds, y, yhat)

performance_season <- performance_season %>% 
  mutate(y = exp(y),
         yhat = exp(yhat))

ggplot(performance_season) +
 geom_line(aes(x = ds, y = y, color = "red")) +
  geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Seasonality Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 

Code
# Check MAE value
season_mae <- mae(performance_season$y, performance_season$yhat)
print(paste("The MAE for the seasonality model is", season_mae))
[1] "The MAE for the seasonality model is 43.6300444474412"
Code
# Check MASE value
season_mase <- mase(performance_season$y, performance_season$yhat)
print(paste("The MASE for the seasonality model is", season_mase))
[1] "The MASE for the seasonality model is 0.714291458705265"

8. Multivariate Model

8.1 Build the Multivariate Model

Code
train_data2 <- train_data %>% 
  select(-c(ds, y))

features <- colnames(train_data2)

# Add seasonality
model_multivariate <- prophet(train_data, fit = FALSE)

# Add all regressors
for (name in features){
   model_multivariate <- add_regressor(model_multivariate, name)
}

# Fit on training set
multi_fit <- fit.prophet(model_multivariate, train_data)

8.2 Multivariate Model Forecast

Code
# Create time range for forecast
future_regressor <- make_future_dataframe(multi_fit, freq = "week", periods = 8)

# Append the regressor values
future_regressor <- full_join(future_regressor, train_data, by = join_by(ds == ds))


# Make prediction
forecast_regressor <- predict(multi_fit, selected_deaths)

# Visualize the forecast
plot(multi_fit, forecast_regressor) +
  labs(title = "Regressor Prophet Model")

Code
prophet_plot_components(multi_fit, forecast_regressor)

8.3 Multivariate Model Performance

Code
# Merge actual and predicted values
forecast_regressor2 <- forecast_regressor %>%
  filter(ds > as.POSIXct("2023-01-01")) %>% 
  select(ds, yhat) %>% 
  mutate(index = seq(1:332))

test_data <- test_data %>% 
  mutate(index = seq(1:332))

performance_regressor <- full_join(test_data, forecast_regressor2, by = join_by(index == index, ds == ds)) %>% 
  select(ds, y, yhat)

performance_regressor <- performance_regressor %>% 
  mutate(y = exp(y),
         yhat = exp(yhat))

# Plotting Test
ggplot(performance_regressor) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Regressor Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 

Code
# Check MAE value
regressor_mae <- mae(performance_regressor$y, performance_regressor$yhat)
print(paste("The MAE for the regressor model is", regressor_mae))
[1] "The MAE for the regressor model is 33.4020188794948"
Code
# Check MASE value
regressor_mase <- mase(performance_regressor$y, performance_regressor$yhat)
print(paste("The MASE for the regressor model is", regressor_mase))
[1] "The MASE for the regressor model is 0.546842825655989"
Code
# Check MAPE value
regressor_mape <- MAPE(performance_regressor$yhat, performance_regressor$y)
print(paste("The MAPE for the regressor model is", regressor_mape))
[1] "The MAPE for the regressor model is 0.493771272925121"

9. Holiday and Event Effect

9.1 Build Model with Holiday and Event

Code
events <- tribble(
  ~holiday, ~ds, ~lower_window, ~upper_window,
  #--|--|----
  "COVID", as.Date('2020-03-14'), -15, 15,
  "superbowl", as.Date('2020-02-02'), -7, 7,
  "superbowl", as.Date('2021-02-07'), -7, 7,
  "superbowl", as.Date('2022-02-13'), -7, 7,
  "superbowl", as.Date('2020-02-12'), -7, 7,
)

events
# A tibble: 5 × 4
  holiday   ds         lower_window upper_window
  <chr>     <date>            <dbl>        <dbl>
1 COVID     2020-03-14          -15           15
2 superbowl 2020-02-02           -7            7
3 superbowl 2021-02-07           -7            7
4 superbowl 2022-02-13           -7            7
5 superbowl 2020-02-12           -7            7
Code
# Add holidays 
model_holiday <- prophet(train_data, holidays = events, n.changepoints = 6, fit = FALSE)

# Add built-in country-specific holidays
model_holiday <- add_country_holidays(model_holiday, country_name = 'US')

# Fit model on training
holiday_fit <- fit.prophet(model_holiday, train_data)

9.2 Holiday Model Forecast

Code
# Create time range for forecast
future_holiday <- make_future_dataframe(holiday_fit, freq = "week", periods = 8)

# Append the regressor values
future_holiday <- left_join(future_holiday, test_data, by = join_by(ds == ds))

# Make prediction
forecast_holiday <- predict(holiday_fit, future_holiday)

# Visualize the forecast
plot(holiday_fit, forecast_holiday) +
  labs(title = "Holiday Prophet Model")

Code
# Visualize the forecast components
prophet_plot_components(holiday_fit, forecast_holiday)

9.3 Holiday Model Performance

Code
# Merge actual and predicted values
forecast_holiday2 <- forecast_holiday %>%
  filter(ds > as.POSIXct("2023-01-01")) %>%
  select(ds, yhat) %>% 
  mutate(index = seq(1:332))

test_data <- test_data %>% 
  mutate(index = seq(1:332))

performance_holiday <- full_join(test_data, forecast_holiday2, by = join_by(ds == ds, index == index)) %>% 
  select(ds, y, yhat)

performance_holiday <- performance_holiday %>% 
  mutate(y = exp(y),
         yhat = exp(yhat))

ggplot(performance_holiday) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Holiday Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 

Code
# Check MAE value
holiday_mae <- mae(performance_holiday$y, performance_holiday$yhat)
print(paste("The MAE for the holiday model is", holiday_mae))
[1] "The MAE for the holiday model is 42.0070113061772"
Code
# Check MASE value
holiday_mase <- mase(performance_holiday$y, performance_holiday$yhat)
print(paste("The MASE for the holiday model is", holiday_mase))
[1] "The MASE for the holiday model is 0.687719890312823"
Code
# Check MAPE value
# holiday_mape <- MAPE(performance_holiday$yhat, performance_holiday$y)
# print(paste("The MAPE for the holiday model is", holiday_mape))

10 Putting Best Parameters Together

Code
# Adding seasonality, holidays, changepoints
best_prophet <- prophet(train_data,
                        yearly.seasonality = TRUE,
                        weekly.seasonality = TRUE,
                        holidays = events,
                        #n.changepoints = 6,
                        fit = FALSE)

best_prophet <- add_country_holidays(best_prophet, country_name = 'US')

# Adding features
for (name in features){
   best_prophet <- add_regressor(best_prophet, name)
}

# Fit model
best_fit <- fit.prophet(best_prophet, train_data)

10.1 Forecasting Best Parameters

Code
# Create time range for forecast
future_best <- make_future_dataframe(best_fit, freq = "week", periods = 8)

# Append the regressor values
future_best <- left_join(future_best, train_data, by = join_by(ds == ds))

# Make prediction
forecast_best <- predict(best_fit, test_data)

# Visualize the forecast
#plot(best_fit, forecast_best) +
 # labs(title = "Best Prophet Model")

10.2 Best Parameters Performance

Code
# Merge actual and predicted values
forecast_best2 <- forecast_best %>%
  filter(ds > as.POSIXct("2023-01-01")) %>% 
  select(ds, yhat) %>% 
  mutate(index = seq(1:332))

test_data <- test_data %>% 
  mutate(index = seq(1:332))

performance_best <- full_join(test_data, forecast_best2, by = join_by(index == index, ds == ds)) %>%
  select(ds, y, yhat)

performance_best <- performance_best %>% 
  mutate(y = exp(y),
         yhat = exp(yhat))

best_test_plot <- ggplot(performance_best) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Best Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 
best_test_plot

Code
# Check MAE value
best_mae <- mae(performance_best$y, performance_best$yhat)
print(paste("The MAE for the best model is", best_mae))
[1] "The MAE for the best model is 32.4825511759144"
Code
# Check MASE value
best_mase <- mase(performance_best$y, performance_best$yhat)
print(paste("The MASE for the best model is", best_mase))
[1] "The MASE for the best model is 0.531789714077933"
Code
# Check MAPE value
best_mape <- MAPE(performance_best$yhat, performance_best$y)
print(paste("The MAPE for the best model is", best_mape))
[1] "The MAPE for the best model is 0.462532528122932"
Code
# Printing All of them together
results <- tribble(
  ~model, ~mae, ~mase,
  #--|--|----
  "baseline", baseline_mae, baseline_mase,
  "changepoint", changepoint_mae, changepoint_mase,
  "holiday", holiday_mae, holiday_mase,
  "regressors", regressor_mae, regressor_mase,
  "season", season_mae, season_mase,
  "best", best_mae, best_mase
) %>% 
  kbl() %>% 
  kable_styling()

results
model mae mase
baseline 43.60113 0.7138181
changepoint 43.33838 0.7095164
holiday 42.00701 0.6877199
regressors 33.40202 0.5468428
season 43.63004 0.7142915
best 32.48255 0.5317897
Code
# save best model
save(results, best_fit, performance_best, forecast_best, best_mase, best_mae,
     file = "results/multi_prophet.rda")